library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Load the transliteration table (the PsycInfo ASCII names in it were guessed
## from the PsycTESTS names).
translit <- readRDS("../sober_rubric/raw_data/psycinfo_psyctests_names.rds")

## Load the first scrape (by journal, checking counts for each year in each
## journal for top tests) and collapse it to one usage count per name and year.
## This tsv can be found in "Scraping-EBSCO-Host\data\merged tables".
psycinfo_scrape_by_journal <-
  read_tsv("../sober_rubric/raw_data/merged_table_all.tsv") %>%
  drop_na(Name) %>%
  # mutate(Name = toTitleCase(Name)) %>%
  rename(usage_count = "Hit Count") %>%
  group_by(Name, Year) %>%
  # Drop the grouping explicitly instead of leaving the result grouped by Name;
  # later steps only use this table as the right-hand side of joins.
  summarise(usage_count = sum(usage_count), .groups = "drop")
## Rows: 309223 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Name, Journal
## dbl (3): Hit Count, Year, number of search results
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## get our second scrape (by test DOI and year)
# Overview table: DOI, first_pub_year, last_pub_year, Hits.
overview <- readr::read_tsv(
  "../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv"
)
## Rows: 71692 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): DOI
## dbl (3): first_pub_year, last_pub_year, Hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Yearly table: DOI, Year, Hits.
byyear <- readr::read_tsv(
  "../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv"
)
## Rows: 218142 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): DOI
## dbl (2): Year, Hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Frequency table of the total hits per DOI in the yearly table
# (missing yearly hits are ignored when summing).
byyear %>%
  group_by(DOI) %>%
  summarise(Hits = sum(Hits, na.rm = TRUE)) %>%
  pull(Hits) %>%
  table()
## .
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
##    27 13280  4107  2140  1487  1077   864   645   570   464   375   375   285 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
##   243   237   220   168   180   163   114   141   132   102   108   113   108 
##    26    27    28    29    30    31    32    33    34    35    36    37    38 
##    83    91    72    86    88    68    81    77    68    61    45    56    48 
##    39    40    41    42    43    44    45    46    47    48    49    50    51 
##    42    60    48    37    45    38    42    41    34    29    29    33    35 
##    52    53    54    55    56    57    58    59    60    61    62    63    64 
##    26    31    25    21    22    32    19    37    26    23    18    24    16 
##    65    66    67    68    69    70    71    72    73    74    75    76    77 
##    25    19    19    22    19    27    18    18    11    12    12    16    11 
##    78    79    80    81    82    83    84    85    86    87    88    89    90 
##    15    22    16    14    10    13    16    10    13     6    10    13    11 
##    91    92    93    94    95    96    97    98    99   100   101   102   103 
##    10     8    13    14    11    10    17    12    11    10    13    12     6 
##   104   105   106   107   108   109   110   111   112   113   114   115   116 
##     8     8    13     9    13     8     6     9     6     7     8     4     5 
##   117   118   119   120   121   122   123   124   125   126   127   128   129 
##     5    13     8     7     7     6    10     9     7     3    13     4     4 
##   130   131   132   133   134   135   136   137   138   139   140   141   142 
##    11     6     4     3     6     5     7     3     6     4     3     8     7 
##   143   144   145   146   147   148   149   150   151   152   153   154   155 
##     9     9     4     8     3     9     4     7     9     6     5     5     3 
##   156   157   158   159   160   161   162   163   164   165   166   167   168 
##     6     5     5     5     4     6     3     3     4     3     3     5     1 
##   169   170   171   172   173   174   175   176   177   178   179   180   181 
##     2     5     3     3     3     3     5     2     2     2     4     8     5 
##   182   183   184   185   186   187   189   190   191   192   193   194   195 
##     4     4     6     5     2     1     3     5     6     1     6     4     5 
##   196   197   198   199   200   201   202   203   204   205   206   207   208 
##     4     4     1     1     3     3     5     1     3     3     3     5     2 
##   209   210   211   212   213   214   215   216   218   219   220   221   222 
##     5     3     7     1     3     4     2     3     4     3     3     4     1 
##   223   224   225   226   227   228   230   231   233   234   235   236   237 
##     2     6     4     1     1     3     1     4     2     3     2     2     1 
##   238   239   240   241   242   244   245   246   247   248   249   251   252 
##     1     4     6     2     1     1     4     4     1     1     1     2     1 
##   254   255   256   257   258   259   260   262   263   264   266   267   268 
##     1     2     3     1     2     3     3     4     3     1     1     2     1 
##   269   270   271   272   274   275   276   278   279   280   282   283   284 
##     2     2     1     3     3     1     2     4     4     2     2     2     2 
##   285   286   287   288   290   291   292   293   294   295   296   297   298 
##     2     1     2     1     1     2     1     3     3     1     2     2     2 
##   299   300   304   305   307   308   309   311   312   313   314   315   316 
##     3     1     1     1     1     4     1     1     1     1     1     3     2 
##   318   319   320   322   324   325   326   327   329   330   331   332   333 
##     1     3     4     2     1     2     1     1     2     1     2     4     1 
##   334   337   338   339   341   342   346   347   348   349   353   358   359 
##     1     1     1     1     1     1     2     1     1     1     1     3     2 
##   361   363   364   367   368   371   372   376   377   379   380   384   387 
##     2     1     2     1     1     2     1     1     2     1     1     2     2 
##   389   392   393   394   396   397   398   400   401   405   407   408   411 
##     1     1     1     1     1     2     1     2     1     2     2     1     1 
##   414   415   418   419   423   424   428   429   430   431   436   437   438 
##     1     1     1     1     1     1     1     1     1     2     1     1     2 
##   441   443   445   446   451   452   456   460   462   464   466   470   483 
##     3     2     1     2     1     1     1     1     1     1     2     1     1 
##   485   486   488   491   495   499   500   504   512   518   519   520   528 
##     1     1     1     1     1     1     3     1     1     1     1     1     2 
##   529   532   534   535   537   538   539   540   542   544   545   546   550 
##     1     1     1     1     1     1     1     1     1     2     1     1     1 
##   553   554   556   561   562   568   569   570   574   577   584   585   589 
##     1     1     1     1     1     1     1     1     2     1     1     1     1 
##   595   597   598   600   601   603   604   623   626   627   631   632   633 
##     1     1     1     1     1     1     1     1     1     1     2     1     1 
##   639   642   656   658   660   661   662   669   671   675   677   678   679 
##     1     2     1     1     1     1     1     1     1     1     1     1     1 
##   682   686   688   696   698   700   709   710   712   714   716   718   720 
##     1     1     1     1     1     1     1     1     1     1     2     2     1 
##   722   724   725   727   728   730   732   733   755   761   762   764   772 
##     1     1     1     1     2     1     1     1     1     1     1     1     1 
##   773   780   783   794   796   800   808   812   813   816   819   825   840 
##     1     1     1     1     2     1     2     1     1     2     1     1     1 
##   844   845   847   848   849   856   862   871   886   891   908   911   915 
##     1     1     2     1     1     1     1     1     1     1     2     1     1 
##   919   928   933   934   935   950   959   969   973   974   981   988   992 
##     1     1     2     1     2     1     1     2     2     1     1     1     1 
##   993  1009  1015  1018  1043  1071  1074  1077  1119  1121  1131  1135  1161 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1163  1172  1173  1181  1184  1219  1224  1247  1251  1253  1255  1267  1296 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1300  1323  1340  1378  1380  1392  1395  1399  1402  1429  1470  1479  1487 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  1519  1521  1553  1562  1569  1579  1642  1648  1688  1748  1772  1825  1868 
##     2     1     1     1     1     1     1     1     1     1     1     1     1 
##  1901  1932  1937  2052  2065  2074  2102  2121  2130  2132  2149  2200  2254 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  2304  2352  2584  2678  2700  2847  3053  3067  3134  3157  3487  3500  3637 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  3675  3750  3790  4041  4096  4410  4484  4876  4888  5147  6257  6313  6365 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  6408  6494  7023  7095  7238  7504  7597  8420  8513  8709  9492 10896 12134 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
## 13316 14268 18484 25118 
##     1     1     1     1
# DOIs whose overview row reports exactly one hit; that single hit is placed
# in the first publication year.
# NOTE(review): this used to be followed by `mutate(Hits = coalesce(Hits, 1))`.
# That step could never change anything, because `filter(Hits == 1)` already
# drops rows where Hits is NA, so it was removed. If DOIs with a missing hit
# count were meant to be kept as single hits, the filter must be widened.
one_hit_wonders <- overview %>%
  filter(Hits == 1) %>%
  mutate(Year = first_pub_year)
# for some few, the call was repeated by year for some reason
one_hit_wonders %>%
  select(DOI, first_pub_year) %>%
  inner_join(byyear, by = "DOI") %>%
  arrange(DOI)
# Remove those DOIs from the yearly table so they are not counted twice when
# the two tables are stacked below.
byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")

# Stack the single-hit DOIs on top of the yearly table, then attach the
# overview columns, with the overview's Hits renamed to total_hits.
psycinfo_by_doi <- one_hit_wonders %>%
  select(DOI, Year, Hits) %>%
  bind_rows(byyear) %>%
  left_join(overview %>% rename(total_hits = Hits), by = "DOI")


## don't use tests with names that occur many times
# All rows whose PsycInfo name is shared by more than one row (kept for
# inspection).
dupe_names <- translit %>%
  group_by(name_psycinfo) %>%
  filter(n() > 1) %>%
  ungroup()
# Keep only the first row per name and flag names that were not unique.
translit <- translit %>%
  group_by(name_psycinfo) %>%
  mutate(non_unique_name = n() > 1) %>%
  filter(row_number() == 1) %>%
  ungroup()

# merge it all
# Full joins keep rows that exist in only one source: scrape 2 is joined to
# the names by DOI, and scrape 1 is joined by name and year.
psycinfo <- psycinfo_by_doi %>%
  full_join(translit %>% select(DOI, name_psycinfo, NameOC), by = "DOI") %>%
  full_join(
    psycinfo_scrape_by_journal,
    by = c("name_psycinfo" = "Name", "Year")
  ) %>%
  rename(
    hits_scrape_1 = usage_count,
    hits_scrape_2 = Hits,
    total_hits_scrape_2 = total_hits
  ) %>%
  group_by(name_psycinfo) %>%
  # na.rm = TRUE: the full joins leave hits_scrape_1 missing for every row
  # without a scrape-1 match, and a plain sum() turned the whole per-name
  # total into NA. This now agrees with how psycinfo_overall sums the column.
  # NOTE: the NA tally printed below was recorded before this change; with
  # na.rm = TRUE, total_hits_scrape_1 can no longer be NA.
  mutate(total_hits_scrape_1 = sum(hits_scrape_1, na.rm = TRUE)) %>%
  # Do not leave the result grouped; later steps regroup where they need to.
  ungroup()
psycinfo %>% is.na() %>% colSums()
##                 DOI                Year       hits_scrape_2      first_pub_year 
##               96747               39022              135768              135768 
##       last_pub_year total_hits_scrape_2       name_psycinfo              NameOC 
##              135768              135768                3079               99825 
##       hits_scrape_1 total_hits_scrape_1 
##              218121              265989
## aggregate it all
# One row per PsycInfo name with the total hits from each scrape. Missing
# values are ignored, so a name absent from a scrape gets a total of 0.
# The (deduplicated) DOI for each name is attached afterwards; the join key is
# spelled out rather than inferred from the shared column name.
psycinfo_overall <- psycinfo %>%
  group_by(name_psycinfo) %>%
  summarise(
    total_hits_scrape_1 = sum(hits_scrape_1, na.rm = TRUE),
    total_hits_scrape_2 = sum(hits_scrape_2, na.rm = TRUE)
  ) %>%
  left_join(translit %>% select(DOI, name_psycinfo), by = "name_psycinfo")
## correlate totals
# Pearson correlation of the per-name totals across all names. Because
# psycinfo_overall sums with na.rm, a name that is missing from one scrape
# enters here with a total of 0 for that scrape.
cor.test(psycinfo_overall$total_hits_scrape_1, psycinfo_overall$total_hits_scrape_2)
## 
##  Pearson's product-moment correlation
## 
## data:  psycinfo_overall$total_hits_scrape_1 and psycinfo_overall$total_hits_scrape_2
## t = 249.62, df = 104320, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6076982 0.6152964
## sample estimates:
##       cor 
## 0.6115114
# Same correlation, restricted to names with at least one hit in both scrapes.
psycinfo_overall %>% 
  filter(total_hits_scrape_1 > 0, total_hits_scrape_2 > 0) %>% 
  summarise(cor(total_hits_scrape_1, total_hits_scrape_2))
## correlate by year, diffs, proportions
# Pearson correlation of the per-row (name x year) hit counts.
cor.test(psycinfo$hits_scrape_1, psycinfo$hits_scrape_2)
## 
##  Pearson's product-moment correlation
## 
## data:  psycinfo$hits_scrape_1 and psycinfo$hits_scrape_2
## t = 467.52, df = 39014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9196473 0.9226533
## sample estimates:
##      cor 
## 0.921164
# Mean absolute difference between the two scrapes; rows lacking either count
# are ignored.
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  abs() %>%
  mean(na.rm = TRUE)
## [1] 12.3914
# Histogram of the ratio scrape 2 / scrape 1 on a log10 x-axis. Built with
# ggplot() + geom_histogram() because qplot() was deprecated in ggplot2 3.4.0.
psycinfo %>%
  mutate(prop = hits_scrape_2 / hits_scrape_1) %>%
  ggplot(aes(x = prop)) +
  geom_histogram() +
  scale_x_log10()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 318095 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean signed difference (scrape 2 minus scrape 1); rows lacking either count
# are ignored.
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  mean(na.rm = TRUE)
## [1] 11.99798
# psycinfo %>% filter(hits_scrape_1 > hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

# Number of rows where scrape 2 reports more hits than scrape 1.
psycinfo %>%
  filter(hits_scrape_1 < hits_scrape_2) %>%
  nrow()
## [1] 27545
# Frequency table of the differences, least frequent values first.
psycinfo %>%
  mutate(diff = hits_scrape_2 - hits_scrape_1) %>%
  pull(diff) %>%
  table() %>%
  sort()
## .
## -165 -143  -99  -98  -84  -81  -73  -50  -43  -41  -39  -35  -31  -27  -23  -21 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  -19  -15  -13  140  143  154  179  182  186  190  195  206  216  226  228  233 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  239  241  243  246  248  250  253  257  258  260  262  263  265  268  269  274 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  278  281  284  285  287  290  293  294  298  301  302  307  311  312  313  316 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  319  325  326  327  328  332  334  335  337  340  344  347  350  351  355  358 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  359  365  376  379  381  383  394  396  398  400  404  406  410  413  414  416 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  417  418  421  428  429  430  432  433  434  437  439  441  443  446  449  460 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  462  466  474  490  493  495  496  502  510  511  512  516  526  531  539  553 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  562  563  567  571  577  586  590  602  604  613  633  639  640  644  655  659 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  661  683  691  700  701  704  714  736  765  771  775  791  804  806  828  854 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  858  865  879  919  950  955  965  966  976 1004 1005 1265 1335 1591  -96  -17 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    2    2 
##  -16  -12  105  119  135  136  141  153  157  159  160  162  164  165  167  169 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  172  173  174  180  183  191  197  198  200  207  211  217  218  225  232  235 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  236  238  244  256  261  267  270  272  273  282  288  295  304  305  306  317 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  318  322  339  342  346  349  352  369  373  375  380  385  392  407  408  431 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  436  438  440  450  456  548  680  -11   -9   99  118  138  146  158  171  177 
##    2    2    2    2    2    2    2    3    3    3    3    3    3    3    3    3 
##  178  185  189  192  196  199  202  204  205  208  215  219  220  222  223  234 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  247  254  264  275  279  286  297  303  309  329  336  356  367  374  382  537 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  -14  -10  132  134  142  144  145  148  150  163  170  176  187  188  193  194 
##    4    4    4    4    4    4    4    4    4    4    4    4    4    4    4    4 
##  224  255  366   -8   97  126  129  139  149  152  155  156  161  166  168  175 
##    4    4    4    5    5    5    5    5    5    5    5    5    5    5    5    5 
##  181  184  209  229  231  107  109  111  116  125  130  151  103  104  113  117 
##    5    5    5    5    5    6    6    6    6    6    6    6    7    7    7    7 
##  120  121  127  133  137  147   89  115  123   -7  110  112  114  124  131   86 
##    7    7    7    7    7    7    8    8    8    9    9    9    9    9    9   10 
##   90   92  100  101  108  122  128   74   87   88   95   96  102   91   93   94 
##   10   10   10   10   10   10   10   11   11   11   11   11   11   12   12   12 
##  106   77   85   98   63   82   72   80   81   83   84   76   70   73   79   -6 
##   12   13   14   14   15   15   16   16   16   16   16   17   18   18   19   20 
##   65   64   69   75   78   60   68   71   66   67   62   -5   59   61   55   57 
##   20   21   21   22   23   24   24   24   26   28   29   30   31   32   33   33 
##   58   50   53   56   46   52   54   43   40   48   51   49   45   42   44   41 
##   34   35   35   39   43   45   45   46   47   47   48   50   52   55   58   59 
##   47   39   38   -4   37   34   36   35   33   32   30   29   31   27   28   26 
##   61   62   69   70   76   77   77   85   96  101  102  107  108  112  123  142 
##   25   23   -3   24   22   21   20   19   18   17   16   15   14   13   12   11 
##  157  163  165  166  183  200  225  248  268  285  311  357  383  431  544  591 
##   -2   10    9    8    7    6    5    4    3    2   -1    1    0 
##  615  688  765  933 1066 1228 1589 2033 2638 3487 3759 4918 6757
# psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

## Top tests in each scrape

### Only in PsycInfo scrape 1

# Names with hits in scrape 1 but none in scrape 2: number of names, their
# total scrape-1 hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(
    total_hits_scrape_1 > 0,
    total_hits_scrape_2 == 0
  ) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1) / n())
# Sets the column-printing options globally, i.e. for all later output too
# (presumably the notebook's paged data-frame printing options; confirm).
options(cols.min.print = 2, cols.print = 2)
# The same names as an interactive table, most scrape-1 hits first.
# The original sorted twice, before and after select(); one sort is enough.
psycinfo_overall %>%
  ungroup() %>%
  # filter(is.na(DOI)) %>%
  filter(total_hits_scrape_2 == 0, total_hits_scrape_1 >= 1) %>%
  select(name_psycinfo, total_hits_scrape_1) %>%
  arrange(desc(total_hits_scrape_1)) %>%
  DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

### Only in PsycTests scrape 2

# Names with hits in scrape 2 but none in scrape 1: number of names, their
# total scrape-2 hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(total_hits_scrape_2 > 0, total_hits_scrape_1 == 0) %>%
  summarise(n(), sum(total_hits_scrape_2), sum(total_hits_scrape_2) / n())
# The same names as an interactive table, most scrape-2 hits first; rows
# without a PsycInfo name are left out.
psycinfo_overall %>%
  ungroup() %>%
  filter(total_hits_scrape_2 >= 1, total_hits_scrape_1 == 0) %>%
  # filter(!is.na(DOI), is.na(total_hits_scrape_1) | total_hits_scrape_1 == 0) %>%
  drop_na(name_psycinfo, total_hits_scrape_2) %>%
  select(name_psycinfo, total_hits_scrape_2) %>%
  arrange(desc(total_hits_scrape_2)) %>%
  DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

### Hits only in scrape 1, even though the name has a matched DOI

# Scrape-1-only names that do have a DOI in the transliteration table:
# number of names, total scrape-1 hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(
    total_hits_scrape_2 == 0,
    total_hits_scrape_1 > 0,
    !is.na(DOI)
  ) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1) / n())

### Hits only in scrape 1, for names without a matched DOI

# Scrape-1-only names with no DOI in the transliteration table:
# number of names, total scrape-1 hits, and mean hits per name.
psycinfo_overall %>%
  ungroup() %>%
  filter(
    total_hits_scrape_2 == 0,
    total_hits_scrape_1 > 0,
    is.na(DOI)
  ) %>%
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1) / n())

## Merge scrapes 1 and 2

# Yearly scrape-1 rows for every name that has hits in scrape 1 but none in
# scrape 2, shaped like the scrape-2 table (DOI, Year, Hits, first/last
# publication year, total hits) so the two can be stacked.
psycinfo_scrape_1_without_hits_in_2 <- psycinfo_overall %>%
  ungroup() %>%
  filter(
    total_hits_scrape_1 > 0,
    is.na(total_hits_scrape_2) | total_hits_scrape_2 == 0
  ) %>%
  select(DOI, name_psycinfo) %>%
  distinct(name_psycinfo, .keep_all = TRUE) %>%
  # Bring back the per-year counts from scrape 1 (one row per name and year).
  left_join(
    psycinfo_scrape_by_journal %>%
      rename(name_psycinfo = Name, Hits = usage_count),
    by = "name_psycinfo",
    multiple = "all"
  ) %>%
  # Names without a DOI use the name itself as a stand-in identifier.
  mutate(DOI = coalesce(DOI, name_psycinfo)) %>%
  group_by(DOI) %>%
  mutate(
    first_pub_year = min(Year, na.rm = TRUE),
    last_pub_year = max(Year, na.rm = TRUE),
    total_hits = sum(Hits, na.rm = TRUE)
  ) %>%
  ungroup()

# Number of tests, total hits, and mean hits per test. Sums ignore missing
# values, like the equivalent summaries of the other tables.
psycinfo_scrape_1_without_hits_in_2 %>%
  summarise(
    n_distinct(DOI),
    sum(Hits, na.rm = TRUE),
    sum(Hits, na.rm = TRUE) / n_distinct(DOI)
  )
# Scrape-2 rows that have both a hit count and a year, excluding DOIs whose
# name totals zero hits in scrape 2, with the PsycInfo name attached.
psycinfo_by_doi_with_hits <- psycinfo_by_doi %>%
  drop_na(Hits, Year) %>%
  anti_join(
    psycinfo_overall %>%
      filter(total_hits_scrape_2 == 0) %>%
      select(DOI),
    by = "DOI"
  ) %>%
  left_join(translit %>% select(DOI, name_psycinfo), by = "DOI")
# Rows without / with a matched PsycInfo name.
sum(is.na(psycinfo_by_doi_with_hits$name_psycinfo))
## [1] 3078
sum(!is.na(psycinfo_by_doi_with_hits$name_psycinfo))
## [1] 215037
# Number of tests, total hits, and mean hits per test.
psycinfo_by_doi_with_hits %>%
  summarise(
    n_distinct(DOI),
    sum(Hits, na.rm = TRUE),
    sum(Hits, na.rm = TRUE) / n_distinct(DOI)
  )
# Stack both tables; the `source` column records which scrape each row came
# from ("scrape_2" or "scrape_1").
psycinfo_merged <- bind_rows(
  scrape_2 = psycinfo_by_doi_with_hits,
  scrape_1 = psycinfo_scrape_1_without_hits_in_2,
  .id = "source"
)

# Number of tests, total hits, and mean hits per test in the merged table.
psycinfo_merged %>%
  summarise(
    n_distinct(DOI),
    sum(Hits, na.rm = TRUE),
    sum(Hits, na.rm = TRUE) / n_distinct(DOI)
  )
# Persist the merged table.
saveRDS(psycinfo_merged, "../sober_rubric/raw_data/psycinfo_merged_scrape_1_and_2.rds")

## Joint top list

# Total hits per test across the merged table, shown as an interactive table
# with the most-used tests first.
psycinfo_merged %>%
  group_by(DOI, name_psycinfo, source) %>%
  # Drop the grouping here instead of relying on a later ungroup().
  summarise(total_hits = sum(Hits, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_hits)) %>%
  select(source, name_psycinfo, total_hits) %>%
  DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBkZl9wcmludDogcGFnZWQKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCmVkaXRvcl9vcHRpb25zOgogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCiMjIGdldCBteSB0cmFuc2xpdGVyYXRpb24gdGFibGUgKEkgdHJpZWQgdG8gZ3Vlc3MgdGhlIFBzeWNJbmZvIEFTQ0lJIG5hbWUgZnJvbSB0aGUgUHN5Y1RFU1RTIG5hbWUpCnRyYW5zbGl0IDwtIHJlYWRSRFMoIi4uL3NvYmVyX3J1YnJpYy9yYXdfZGF0YS9wc3ljaW5mb19wc3ljdGVzdHNfbmFtZXMucmRzIikKCiMjIGdldCBvdXIgZmlyc3Qgc2NyYXBlIChieSBqb3VybmFsLCBjaGVja2luZyBjb3VudHMgZm9yIGVhY2ggeWVhciBpbiBlYWNoIGpvdXJuYWwgZm9yIHRvcCB0ZXN0cykKcHN5Y2luZm9fc2NyYXBlX2J5X2pvdXJuYWwgPC0gcmVhZF90c3YoJy4uL3NvYmVyX3J1YnJpYy9yYXdfZGF0YS9tZXJnZWRfdGFibGVfYWxsLnRzdicpICU+JSAKICBkcm9wX25hKE5hbWUpICU+JSAKICAjIHRoaXMgdHN2IGNhbiBiZSBmb3VuZCBpbiAiU2NyYXBpbmctRUJTQ08tSG9zdFxkYXRhXG1lcmdlZCB0YWJsZXMiCiMgIG11dGF0ZShOYW1lID0gdG9UaXRsZUNhc2UoTmFtZSkpICU+JSAKICByZW5hbWUodXNhZ2VfY291bnQgPSAiSGl0IENvdW50IikgJT4lIAogIGdyb3VwX2J5KE5hbWUsIFllYXIpICU+JSAKICBzdW1tYXJpc2UodXNhZ2VfY291bnQgPSBzdW0odXNhZ2VfY291bnQpKQoKIyMgZ2V0IG91ciBzZWNvbmQgc2NyYXBlIChieSB0ZXN0IERPSSBhbmQgeWVhcikKb3ZlcnZpZXcgPC0gcmVhZHI6OnJlYWRfdHN2KCIuLi9zb2Jlcl9ydWJyaWMvcmF3X2RhdGEvMjAyMzA2MTdfZWJzY29fc2NyYXBlX2NsZWFuX292ZXJ2aWV3X3RhYmxlXzEudHN2IikKYnl5ZWFyIDwtIHJlYWRyOjpyZWFkX3RzdigiLi4vc29iZXJfcnVicmljL3Jhd19kYXRhLzIwMjMwNjE3X2Vic2NvX3NjcmFwZV90YWJsZV95ZWFyc18xLnRzdiIpCmJ5eWVhciAlPiUgZ3JvdXBfYnkoRE9JKSAlPiUgc3VtbWFyaXNlKEhpdHMgPSBzdW0oSGl0cywgbmEucm09VCkpICU+JSBwdWxsKEhpdHMpICU+JSB0YWJsZSgpCgpvbmVfaGl0X3dvbmRlcnMgPC0gb3ZlcnZpZXcgJT4lIGZpbHRlcihIaXRzID09IDEpICU+JSAKICBtdXRhdGUoWWVhciA9IGZpcnN0X3B1Yl95ZWFyKSAlPiUgCiAgbXV0YXRlKEhpdHMgPSBjb2FsZXNjZShIaXRzLCAxKSkKIyBmb3Igc29tZSBmZXcsIHRoZSBjYWxsIHdhcyByZXBlYXRlZCBieSB5ZWFyIGZvciBzb21lIHJlYXNvbgpvbmVfaGl0X3dvbmRlcnMgJT4lIHNlbGVjdChET0ksIGZpcnN0X3B1Yl95ZWFyKSAlPiUgaW5uZXJfam9pbihieXllYXIsIGJ5ID0gIkRPSSIpICU+JSBhcnJhbmdlKERPSSkKCmJ5eWVhciA8LSBieXllYXIgJT4lIGFudGlfam9pbihvbmVfaGl0X3dvbmRlcnMsIGJ5ID0gIkRPSSIpCgpwc3ljaW5mb19ieV9kb2kg
PC0gb25lX2hpdF93b25kZXJzICU+JSAKICBzZWxlY3QoRE9JLCBZZWFyLCBIaXRzKSAlPiUgCiAgYmluZF9yb3dzKGJ5eWVhcikgJT4lIAogIGxlZnRfam9pbihvdmVydmlldyAlPiUgcmVuYW1lKHRvdGFsX2hpdHMgPSBIaXRzKSwgYnkgPSAiRE9JIikKCgojIyBkb24ndCB1c2UgdGVzdHMgd2l0aCBuYW1lcyB0aGF0IG9jY3VyIG1hbnkgdGltZXMKZHVwZV9uYW1lcyA8LSB0cmFuc2xpdCAlPiUgZ3JvdXBfYnkobmFtZV9wc3ljaW5mbykgJT4lIGZpbHRlcihuKCkgPiAxKSAlPiUgdW5ncm91cCgpCnRyYW5zbGl0IDwtIHRyYW5zbGl0ICU+JSBncm91cF9ieShuYW1lX3BzeWNpbmZvKSAlPiUgCiAgbXV0YXRlKG5vbl91bmlxdWVfbmFtZSA9IG4oKSA+IDEpICU+JSAKICBmaWx0ZXIocm93X251bWJlcigpID09IDEpICU+JSB1bmdyb3VwKCkKCiMgbWVyZ2UgaXQgYWxsCnBzeWNpbmZvIDwtIHBzeWNpbmZvX2J5X2RvaSAlPiUgCiAgZnVsbF9qb2luKHRyYW5zbGl0ICU+JSBzZWxlY3QoRE9JLCBuYW1lX3BzeWNpbmZvLCBOYW1lT0MpLCBieSA9ICJET0kiKSAlPiUgCiAgZnVsbF9qb2luKHBzeWNpbmZvX3NjcmFwZV9ieV9qb3VybmFsLCBieSA9IGMoIm5hbWVfcHN5Y2luZm8iID0gIk5hbWUiLCAiWWVhciIpKSAlPiUgCiAgcmVuYW1lKGhpdHNfc2NyYXBlXzEgPSB1c2FnZV9jb3VudCwKICAgICAgICAgaGl0c19zY3JhcGVfMiA9IEhpdHMsCiAgICAgICAgIHRvdGFsX2hpdHNfc2NyYXBlXzIgPSB0b3RhbF9oaXRzKSAlPiUgCiAgZ3JvdXBfYnkobmFtZV9wc3ljaW5mbykgJT4lIAogIG11dGF0ZSh0b3RhbF9oaXRzX3NjcmFwZV8xID0gc3VtKGhpdHNfc2NyYXBlXzEpKQpwc3ljaW5mbyAlPiUgaXMubmEoKSAlPiUgY29sU3VtcygpCgojIyBhZ2dyZWdhdGUgaXQgYWxsCnBzeWNpbmZvX292ZXJhbGwgPC0gcHN5Y2luZm8gJT4lIAogIGdyb3VwX2J5KG5hbWVfcHN5Y2luZm8pICU+JSAKICBzdW1tYXJpc2UodG90YWxfaGl0c19zY3JhcGVfMSA9IHN1bShoaXRzX3NjcmFwZV8xLCBuYS5ybSA9IFQpLAogICAgICAgICB0b3RhbF9oaXRzX3NjcmFwZV8yID0gc3VtKGhpdHNfc2NyYXBlXzIsIG5hLnJtID0gVCkpICU+JSAKICBsZWZ0X2pvaW4odHJhbnNsaXQgJT4lIHNlbGVjdChET0ksIG5hbWVfcHN5Y2luZm8pKQoKIyMgY29ycmVsYXRlIHRvdGFscwpjb3IudGVzdChwc3ljaW5mb19vdmVyYWxsJHRvdGFsX2hpdHNfc2NyYXBlXzEsIHBzeWNpbmZvX292ZXJhbGwkdG90YWxfaGl0c19zY3JhcGVfMikKcHN5Y2luZm9fb3ZlcmFsbCAlPiUgCiAgZmlsdGVyKHRvdGFsX2hpdHNfc2NyYXBlXzEgPiAwLCB0b3RhbF9oaXRzX3NjcmFwZV8yID4gMCkgJT4lIAogIHN1bW1hcmlzZShjb3IodG90YWxfaGl0c19zY3JhcGVfMSwgdG90YWxfaGl0c19zY3JhcGVfMikpCgoKIyMgY29ycmVsYXRlIGJ5IHllYXIsIGRpZmZzLCBwcm9wb3J0aW9ucwpjb3IudGVzdChwc3ljaW5mbyRoaXRzX3NjcmFwZV8xLCBwc3ljaW5mbyRoaXRzX3NjcmFwZV8yKQpwc3ljaW5mbyAl
PiUgIG11dGF0ZShkaWZmID0gaGl0c19zY3JhcGVfMiAtIGhpdHNfc2NyYXBlXzEpICU+JSBwdWxsKGRpZmYpICU+JSBhYnMoKSAlPiUgbWVhbihuYS5ybT1UKQpwc3ljaW5mbyAlPiUgIG11dGF0ZShwcm9wID0gaGl0c19zY3JhcGVfMi8gaGl0c19zY3JhcGVfMSkgJT4lIHB1bGwocHJvcCkgJT4lICBxcGxvdCgpICsgc2NhbGVfeF9sb2cxMCgpCnBzeWNpbmZvICU+JSAgbXV0YXRlKGRpZmYgPSBoaXRzX3NjcmFwZV8yIC0gaGl0c19zY3JhcGVfMSkgJT4lIHB1bGwoZGlmZikgJT4lICBtZWFuKG5hLnJtPVQpCiMgcHN5Y2luZm8gJT4lIGZpbHRlcihoaXRzX3NjcmFwZV8xID4gaGl0c19zY3JhcGVfMikgJT4lIHNlbGVjdChET0ksIFllYXIsIG5hbWVfcHN5Y2luZm8sIE5hbWVPQywgaGl0c19zY3JhcGVfMSwgaGl0c19zY3JhcGVfMikgJT4lIG11dGF0ZShkaWZmID0gaGl0c19zY3JhcGVfMiAtIGhpdHNfc2NyYXBlXzEpICU+JSBhcnJhbmdlKGRpZmYpICU+JSBWaWV3KCkKCnBzeWNpbmZvICU+JSBmaWx0ZXIoaGl0c19zY3JhcGVfMSA8IGhpdHNfc2NyYXBlXzIpICU+JSBucm93KCkKcHN5Y2luZm8gJT4lICBtdXRhdGUoZGlmZiA9IGhpdHNfc2NyYXBlXzIgLSBoaXRzX3NjcmFwZV8xKSAlPiUgcHVsbChkaWZmKSAlPiUgdGFibGUoKSAlPiUgc29ydCgpCgojIHBzeWNpbmZvICU+JSBmaWx0ZXIoaGl0c19zY3JhcGVfMSA8IGhpdHNfc2NyYXBlXzIpICU+JSBzZWxlY3QoRE9JLCBZZWFyLCBuYW1lX3BzeWNpbmZvLCBOYW1lT0MsIGhpdHNfc2NyYXBlXzEsIGhpdHNfc2NyYXBlXzIpICU+JSBtdXRhdGUoZGlmZiA9IGhpdHNfc2NyYXBlXzIgLSBoaXRzX3NjcmFwZV8xKSAlPiUgYXJyYW5nZShkaWZmKSAlPiUgVmlldygpCmBgYAoKCiMjIFRvcCBUZXN0cyBpbiBlYWNoCgojIyMgT25seSBpbiBQc3ljSW5mbyBTY3JhcGUgMQpgYGB7ciBjb2xzLnByaW50PTN9CnBzeWNpbmZvX292ZXJhbGwgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgZmlsdGVyKHRvdGFsX2hpdHNfc2NyYXBlXzEgPiAwLAogICAgICAgICB0b3RhbF9oaXRzX3NjcmFwZV8yID09IDApICU+JSAKICBzdW1tYXJpc2UobigpLCBzdW0odG90YWxfaGl0c19zY3JhcGVfMSksIHN1bSh0b3RhbF9oaXRzX3NjcmFwZV8xKS9uKCkpCgpvcHRpb25zKGNvbHMubWluLnByaW50ID0gMiwgY29scy5wcmludCA9IDIpCmBgYAoKCmBgYHtyIGNvbHMubWluLnByaW50PTJ9CnBzeWNpbmZvX292ZXJhbGwgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgIyBmaWx0ZXIoaXMubmEoRE9JKSkgJT4lCiAgZmlsdGVyKHRvdGFsX2hpdHNfc2NyYXBlXzIgPT0gMCwgdG90YWxfaGl0c19zY3JhcGVfMSA+PSAxKSAlPiUgCiAgYXJyYW5nZShkZXNjKHRvdGFsX2hpdHNfc2NyYXBlXzEpKSAlPiUgCiAgc2VsZWN0KG5hbWVfcHN5Y2luZm8sIHRvdGFsX2hpdHNfc2NyYXBlXzEpICU+JSAKICBhcnJhbmdlKGRlc2ModG90YWxfaGl0c19zY3JhcGVfMSkpICU+JSAKICBEVDo6ZGF0YXRhYmxlKCkKYGBgCgojIyMgT25seSBp
biBQc3ljVGVzdHMgU2NyYXBlIDIKYGBge3IgY29scy5wcmludD0zfQpwc3ljaW5mb19vdmVyYWxsICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIGZpbHRlcih0b3RhbF9oaXRzX3NjcmFwZV8xID09IDAsCiAgICAgICAgIHRvdGFsX2hpdHNfc2NyYXBlXzIgPiAwKSAlPiUgCiAgc3VtbWFyaXNlKG4oKSwgc3VtKHRvdGFsX2hpdHNfc2NyYXBlXzIpLCBzdW0odG90YWxfaGl0c19zY3JhcGVfMikvbigpKQpgYGAKCgpgYGB7ciBjb2xzLm1pbi5wcmludD0yfQpwc3ljaW5mb19vdmVyYWxsICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIGZpbHRlcih0b3RhbF9oaXRzX3NjcmFwZV8xID09IDAsIHRvdGFsX2hpdHNfc2NyYXBlXzIgPj0gMSkgJT4lIAogICMgZmlsdGVyKCFpcy5uYShET0kpLCBpcy5uYSh0b3RhbF9oaXRzX3NjcmFwZV8xKSB8IHRvdGFsX2hpdHNfc2NyYXBlXzEgPT0gMCkgJT4lIAogIGRyb3BfbmEobmFtZV9wc3ljaW5mbywgdG90YWxfaGl0c19zY3JhcGVfMikgJT4lIAogIGFycmFuZ2UoZGVzYyh0b3RhbF9oaXRzX3NjcmFwZV8yKSkgJT4lIAogIHNlbGVjdCggbmFtZV9wc3ljaW5mbywgdG90YWxfaGl0c19zY3JhcGVfMikgJT4lIAogIERUOjpkYXRhdGFibGUoKQpgYGAKCgojIyMgSGl0cyBvbmx5IGluIHNjcmFwZSAxLCBldmVuIHRob3VnaCB3ZSBoYXZlIGEgbWF0Y2ggZm9yIHRoZSBuYW1lCmBgYHtyfQpwc3ljaW5mb19vdmVyYWxsICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIGZpbHRlcighaXMubmEoRE9JKSwKICAgICAgICAgdG90YWxfaGl0c19zY3JhcGVfMSA+IDAsCiAgICAgICAgIHRvdGFsX2hpdHNfc2NyYXBlXzIgPT0gMCkgJT4lIAogIHN1bW1hcmlzZShuKCksIHN1bSh0b3RhbF9oaXRzX3NjcmFwZV8xKSwgc3VtKHRvdGFsX2hpdHNfc2NyYXBlXzEpL24oKSkKYGBgCgojIyMgSGl0cyBvbmx5IGluIHNjcmFwZSAxIHdpdGhvdXQgYSBjbGVhciBtYXRjaCBmb3IgdGhlIG5hbWUKYGBge3J9CnBzeWNpbmZvX292ZXJhbGwgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgZmlsdGVyKGlzLm5hKERPSSksCiAgICAgICAgIHRvdGFsX2hpdHNfc2NyYXBlXzEgPiAwLAogICAgICAgICB0b3RhbF9oaXRzX3NjcmFwZV8yID09IDApICU+JSAKICBzdW1tYXJpc2UobigpLCBzdW0odG90YWxfaGl0c19zY3JhcGVfMSksIHN1bSh0b3RhbF9oaXRzX3NjcmFwZV8xKS9uKCkpCmBgYAoKIyMgTWVyZ2UgU2NyYXBlIDEgYW5kIDIKYGBge3J9CnBzeWNpbmZvX3NjcmFwZV8xX3dpdGhvdXRfaGl0c19pbl8yIDwtIHBzeWNpbmZvX292ZXJhbGwgJT4lIAogICAgdW5ncm91cCgpICU+JSAKICAgIGZpbHRlcih0b3RhbF9oaXRzX3NjcmFwZV8xID4gMCwgaXMubmEodG90YWxfaGl0c19zY3JhcGVfMikgfCB0b3RhbF9oaXRzX3NjcmFwZV8yID09IDApICU+JSAKICAgIHNlbGVjdChET0ksIG5hbWVfcHN5Y2luZm8pICU+JSAKICAgIGRpc3RpbmN0KG5hbWVfcHN5Y2luZm8sIC5rZWVwX2FsbCA9IFRSVUUpICU+JSAKICAgIGxlZnRfam9pbihwc3ljaW5mb19z
Y3JhcGVfYnlfam91cm5hbCAlPiUgCiAgICAgcmVuYW1lKG5hbWVfcHN5Y2luZm8gPSBOYW1lLCBIaXRzID0gdXNhZ2VfY291bnQpLCBieSA9ICJuYW1lX3BzeWNpbmZvIiwgbXVsdGlwbGUgPSAiYWxsIikgJT4lIAogICAgbXV0YXRlKERPSSA9IGNvYWxlc2NlKERPSSwgbmFtZV9wc3ljaW5mbykpICU+JSAKICAgIGdyb3VwX2J5KERPSSkgJT4lIAogICAgbXV0YXRlKGZpcnN0X3B1Yl95ZWFyID0gbWluKFllYXIsIG5hLnJtID0gVCksCiAgICAgICAgICAgbGFzdF9wdWJfeWVhciA9IG1heChZZWFyLCBuYS5ybSA9IFQpLAogICAgICAgICAgIHRvdGFsX2hpdHMgPSBzdW0oSGl0cywgbmEucm0gPSBUKSkgJT4lIAogIHVuZ3JvdXAoKQoKcHN5Y2luZm9fc2NyYXBlXzFfd2l0aG91dF9oaXRzX2luXzIgJT4lIAogIHN1bW1hcmlzZShuX2Rpc3RpbmN0KERPSSksIHN1bShIaXRzKSwgc3VtKEhpdHMpL25fZGlzdGluY3QoRE9JKSkKCnBzeWNpbmZvX2J5X2RvaV93aXRoX2hpdHMgPC0gcHN5Y2luZm9fYnlfZG9pICU+JQogIGRyb3BfbmEoSGl0cywgWWVhcikgJT4lIAogIGFudGlfam9pbihwc3ljaW5mb19vdmVyYWxsICU+JSBmaWx0ZXIodG90YWxfaGl0c19zY3JhcGVfMiA9PSAwKSAlPiUgc2VsZWN0KERPSSksIGJ5ID0gIkRPSSIpICU+JSAKICBsZWZ0X2pvaW4odHJhbnNsaXQgJT4lIHNlbGVjdChET0ksIG5hbWVfcHN5Y2luZm8pLCBieSA9ICJET0kiKQpzdW0oaXMubmEocHN5Y2luZm9fYnlfZG9pX3dpdGhfaGl0cyRuYW1lX3BzeWNpbmZvKSkKc3VtKCFpcy5uYShwc3ljaW5mb19ieV9kb2lfd2l0aF9oaXRzJG5hbWVfcHN5Y2luZm8pKQoKcHN5Y2luZm9fYnlfZG9pX3dpdGhfaGl0cyAlPiUgCiAgc3VtbWFyaXNlKG5fZGlzdGluY3QoRE9JKSwgc3VtKEhpdHMsIG5hLnJtID0gVCksIHN1bShIaXRzLCBuYS5ybSA9IFQpL25fZGlzdGluY3QoRE9JKSkKCnBzeWNpbmZvX21lcmdlZCA8LSBiaW5kX3Jvd3MoCiAgc2NyYXBlXzIgPSBwc3ljaW5mb19ieV9kb2lfd2l0aF9oaXRzLCAKICBzY3JhcGVfMSA9IHBzeWNpbmZvX3NjcmFwZV8xX3dpdGhvdXRfaGl0c19pbl8yLCAuaWQgPSAic291cmNlIikKCnBzeWNpbmZvX21lcmdlZCAlPiUgCiAgc3VtbWFyaXNlKG5fZGlzdGluY3QoRE9JKSwgc3VtKEhpdHMsIG5hLnJtID0gVCksIHN1bShIaXRzLCBuYS5ybSA9IFQpL25fZGlzdGluY3QoRE9JKSkKCnNhdmVSRFMocHN5Y2luZm9fbWVyZ2VkLCAiLi4vc29iZXJfcnVicmljL3Jhd19kYXRhL3BzeWNpbmZvX21lcmdlZF9zY3JhcGVfMV9hbmRfMi5yZHMiKQpgYGAKCgojIyBKb2ludCB0b3AgbGlzdAoKYGBge3J9CnBzeWNpbmZvX21lcmdlZCAlPiUgCiAgZ3JvdXBfYnkoRE9JLCBuYW1lX3BzeWNpbmZvLCBzb3VyY2UpICU+JQogIHN1bW1hcmlzZSh0b3RhbF9oaXRzID0gc3VtKEhpdHMsIG5hLnJtICA9IFQpKSAlPiUgCiAgYXJyYW5nZShkZXNjKHRvdGFsX2hpdHMpKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBzZWxlY3QoIHNvdXJjZSwgbmFtZV9wc3lj
aW5mbywgdG90YWxfaGl0cykgJT4lIAogIERUOjpkYXRhdGFibGUoKQpgYGAKCg==